import pandas as pd
import numpy as np
import altair as alt
import seaborn as sns
from matplotlib import pyplot as plt
from plotnine import theme_dark, facet_grid, theme_classic, element_rect, element_line
from plotnine import ggplot, geom_point, aes, stat_smooth, facet_wrap, xlab, scale_x_log10, theme_bw, theme, element_text, theme_dark
import plotly.express as px
import plotly.offline as py
import plotly.graph_objs as go
import plotly
# Enable inline Plotly rendering for the notebook session.
plotly.offline.init_notebook_mode()

# Load the raw measurement table from the Excel workbook.
data = pd.read_excel('wuhan.xlsx', engine='openpyxl')

# PATIENT_ID has gaps that are filled from the previous row, so every
# measurement row carries an id. `Series.ffill()` replaces the deprecated
# `fillna(method='ffill')` spelling and does exactly the same thing.
data["PATIENT_ID"] = data["PATIENT_ID"].ffill()
data
| PATIENT_ID | RE_DATE | age | gender | Admission time | Discharge time | outcome | Hypersensitive cardiac troponinI | hemoglobin | Serum chloride | ... | mean corpuscular hemoglobin | Activation of partial thromboplastin time | High sensitivity C-reactive protein | HIV antibody quantification | serum sodium | thrombocytocrit | ESR | glutamic-pyruvic transaminase | eGFR | creatinine | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 2020-01-31 01:09:00 | 73 | 1 | 2020-01-30 22:12:47 | 2020-02-17 12:40:09 | 0 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 1.0 | 2020-01-31 01:25:00 | 73 | 1 | 2020-01-30 22:12:47 | 2020-02-17 12:40:09 | 0 | NaN | 136.0 | NaN | ... | 31.9 | NaN | NaN | NaN | NaN | 0.12 | NaN | NaN | NaN | NaN |
| 2 | 1.0 | 2020-01-31 01:44:00 | 73 | 1 | 2020-01-30 22:12:47 | 2020-02-17 12:40:09 | 0 | NaN | NaN | 103.1 | ... | NaN | NaN | 43.1 | NaN | 137.7 | NaN | NaN | 16.0 | 46.6 | 130.0 |
| 3 | 1.0 | 2020-01-31 01:45:00 | 73 | 1 | 2020-01-30 22:12:47 | 2020-02-17 12:40:09 | 0 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 1.0 | 2020-01-31 01:56:00 | 73 | 1 | 2020-01-30 22:12:47 | 2020-02-17 12:40:09 | 0 | 19.9 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6115 | 375.0 | 2020-02-16 11:21:00 | 68 | 1 | 2020-02-08 23:25:01 | 2020-02-19 01:31:58 | 1 | 84.9 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 6116 | 375.0 | 2020-02-16 12:04:00 | 68 | 1 | 2020-02-08 23:25:01 | 2020-02-19 01:31:58 | 1 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 6117 | 375.0 | 2020-02-16 12:14:00 | 68 | 1 | 2020-02-08 23:25:01 | 2020-02-19 01:31:58 | 1 | NaN | NaN | 105.2 | ... | NaN | NaN | 267.0 | NaN | 139.3 | NaN | NaN | 17.0 | 88.6 | 77.0 |
| 6118 | 375.0 | 2020-02-16 14:11:00 | 68 | 1 | 2020-02-08 23:25:01 | 2020-02-19 01:31:58 | 1 | NaN | 155.0 | NaN | ... | 31.6 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 6119 | 375.0 | 2020-02-16 14:37:00 | 68 | 1 | 2020-02-08 23:25:01 | 2020-02-19 01:31:58 | 1 | NaN | NaN | NaN | ... | NaN | 35.8 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
6120 rows × 81 columns
# One row per patient: average every numeric measurement over all of the
# patient's records. `numeric_only=True` makes the exclusion of the
# timestamp columns explicit instead of relying on pandas silently dropping
# them (newer pandas versions no longer do that).
new_data = data.groupby("PATIENT_ID").mean(numeric_only=True)

# Absolute Pearson correlation between every pair of per-patient features.
cor = new_data.corr().abs()

# Correlation of each feature with the outcome column; `cor` is already
# non-negative, so no second abs() is needed.
cor_target = cor["outcome"]

# Keep the features whose absolute correlation with the outcome exceeds 0.5.
relevant_features = cor_target[cor_target > 0.5]
relevant_features.sort_values()
monocytes(%) 0.520418 eosinophils(%) 0.527309 Platelet count 0.535089 age 0.561788 calcium 0.639765 D-D dimer 0.671596 Prothrombin activity 0.671888 neutrophils count 0.687342 Fibrin degradation products 0.692018 Lactate dehydrogenase 0.694681 albumin 0.722435 High sensitivity C-reactive protein 0.730180 neutrophils(%) 0.751460 (%)lymphocyte 0.765178 outcome 1.000000 Name: outcome, dtype: float64
# Single-column frame (column 'outcome') of the selected correlations,
# ordered from the weakest to the strongest.
df = relevant_features.to_frame().sort_values(by='outcome')
df
| outcome | |
|---|---|
| monocytes(%) | 0.520418 |
| eosinophils(%) | 0.527309 |
| Platelet count | 0.535089 |
| age | 0.561788 |
| calcium | 0.639765 |
| D-D dimer | 0.671596 |
| Prothrombin activity | 0.671888 |
| neutrophils count | 0.687342 |
| Fibrin degradation products | 0.692018 |
| Lactate dehydrogenase | 0.694681 |
| albumin | 0.722435 |
| High sensitivity C-reactive protein | 0.730180 |
| neutrophils(%) | 0.751460 |
| (%)lymphocyte | 0.765178 |
| outcome | 1.000000 |
# Horizontal bar chart of the absolute correlation of each selected feature
# with the outcome. `reset_index()` exposes the feature names as the
# 'index' column, which is used for the y axis.
fig = px.bar(
    df.reset_index(),
    x='outcome',
    y='index',
    hover_data=[],
    color='outcome',
    height=400,
)
# Axis titles (spelling corrected: "Correlation", "attributes").
fig.update_layout(
    xaxis={'title': 'Correlation with outcome'},
    yaxis={'title': 'Blood attributes'},
)
fig.show()
# Narrow the selection down to the features whose absolute correlation
# with the outcome is above 0.7.
is_strong = cor_target > 0.7
relevant_features = cor_target[is_strong]
relevant_features.sort_values()
albumin 0.722435 High sensitivity C-reactive protein 0.730180 neutrophils(%) 0.751460 (%)lymphocyte 0.765178 outcome 1.000000 Name: outcome, dtype: float64
# Keep age and gender first, followed by the strongly correlated features
# (which include the outcome column itself).
new_data = new_data[['age', 'gender', *relevant_features.index]]
new_data
| age | gender | outcome | albumin | neutrophils(%) | (%)lymphocyte | High sensitivity C-reactive protein | |
|---|---|---|---|---|---|---|---|
| PATIENT_ID | |||||||
| 1.0 | 73 | 1 | 0 | 34.480000 | 68.360000 | 22.720000 | 16.433333 |
| 2.0 | 61 | 1 | 0 | 35.950000 | 80.625000 | 13.650000 | 27.400000 |
| 3.0 | 70 | 2 | 0 | 35.766667 | 67.166667 | 26.500000 | 22.950000 |
| 4.0 | 74 | 1 | 0 | 34.000000 | 71.150000 | 18.250000 | 61.350000 |
| 5.0 | 29 | 2 | 0 | 39.850000 | 59.766667 | 30.666667 | 3.900000 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 371.0 | 63 | 1 | 1 | 30.700000 | 79.500000 | 15.000000 | 152.000000 |
| 372.0 | 79 | 1 | 1 | 21.122222 | 93.560000 | 1.650000 | 232.187500 |
| 373.0 | 61 | 2 | 1 | 23.900000 | 89.000000 | 6.300000 | 205.800000 |
| 374.0 | 33 | 1 | 1 | 27.266667 | 95.150000 | 2.300000 | 109.800000 |
| 375.0 | 68 | 1 | 1 | 26.933333 | 87.700000 | 8.300000 | 162.750000 |
375 rows × 7 columns
# Parallel-coordinates view of the per-patient table, one line per patient,
# coloured by outcome.
#
# Plotly Express expects `labels` as {column name: display label}. The
# previous dict was written the other way round (and spelled the albumin
# column "albium"), so none of the axis labels were actually applied.
# NOTE(review): the "[g/dl]" unit is carried over from the original label;
# confirm it against the data source.
fig = px.parallel_coordinates(
    new_data,
    color="outcome",
    labels={
        "age": "Age",
        "gender": "Gender",
        "albumin": "Albumin[g/dl]",
        "neutrophils(%)": "Neutrophils[%]",
        "(%)lymphocyte": "Lymphocyte[%]",
        "High sensitivity C-reactive protein": "HSC",
    },
    color_continuous_scale=px.colors.diverging.Geyser,
)
# The colour bar adds nothing here because outcome is also drawn as an axis.
fig.update_layout(coloraxis_showscale=False)
# Show the plot
fig.show()
# Restrict the raw table to the demographic columns plus three lab values,
# then average each patient's records into a single row.
new_data = data[[
    "PATIENT_ID",
    "age",
    "gender",
    "outcome",
    "Lactate dehydrogenase",
    "High sensitivity C-reactive protein",
    "(%)lymphocyte",
]]
per_patient = new_data.groupby("PATIENT_ID")
HSC = per_patient.mean()
HSC
| age | gender | outcome | Lactate dehydrogenase | High sensitivity C-reactive protein | (%)lymphocyte | |
|---|---|---|---|---|---|---|
| PATIENT_ID | ||||||
| 1.0 | 73 | 1 | 0 | 232.000000 | 16.433333 | 22.720000 |
| 2.0 | 61 | 1 | 0 | 450.250000 | 27.400000 | 13.650000 |
| 3.0 | 70 | 2 | 0 | 274.333333 | 22.950000 | 26.500000 |
| 4.0 | 74 | 1 | 0 | 293.500000 | 61.350000 | 18.250000 |
| 5.0 | 29 | 2 | 0 | 187.000000 | 3.900000 | 30.666667 |
| ... | ... | ... | ... | ... | ... | ... |
| 371.0 | 63 | 1 | 1 | 573.000000 | 152.000000 | 15.000000 |
| 372.0 | 79 | 1 | 1 | 383.666667 | 232.187500 | 1.650000 |
| 373.0 | 61 | 2 | 1 | 702.000000 | 205.800000 | 6.300000 |
| 374.0 | 33 | 1 | 1 | 1706.333333 | 109.800000 | 2.300000 |
| 375.0 | 68 | 1 | 1 | 983.333333 | 162.750000 | 8.300000 |
375 rows × 6 columns
# Annotated heatmap of the pairwise correlations in the per-patient HSC table.
plt.figure(figsize=(12, 10))
cor = HSC.corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
# `plt.show` without parentheses only evaluates to the function object (the
# cell used to print its repr); it has to be called to render the figure.
plt.show()
<function matplotlib.pyplot.show(close=None, block=None)>
# Readable category columns for the charts.
gender_labels = {1: 'Men', 2: 'Women'}
outcome_labels = {0: 'Recovered', 1: 'Dead'}
HSC['men_women'] = HSC['gender'].map(gender_labels)
HSC['recovered_dead'] = HSC['outcome'].map(outcome_labels)

# Two linked selections: an interval brush on the scatter plot and a
# click selection on the colour encoding of the bar chart.
brush = alt.selection_interval()
click = alt.selection_multi(encodings=['color'])

# Shared outcome colour encoding with fixed colours per category.
scale = alt.Scale(domain=['Recovered', 'Dead'], range=['#117733', '#994455'])
color = alt.Color('recovered_dead:N', scale=scale, title='Outcome')

# Scatter plot: age against C-reactive protein. Points outside the brush are
# greyed out, and only the outcomes clicked in the bar chart are shown.
points = alt.Chart(HSC).mark_point()
points = points.encode(
    x=alt.X('age:Q', title='Age'),
    y=alt.Y(
        'High sensitivity C-reactive protein:Q',
        title="High sensitivity C-reactive protein [mg/l]",
    ),
    size=alt.Size('men_women:N', title='Gender'),
    color=alt.condition(brush, color, alt.value('lightgray')),
    tooltip=[
        'men_women:N',
        'recovered_dead:N',
        'age:N',
        '(%)lymphocyte:N',
        'Lactate dehydrogenase:N',
        'High sensitivity C-reactive protein:N',
    ],
)
points = points.add_selection(brush)
points = points.properties(width=1500)
points = points.transform_filter(click)

# Bar chart: number of patients per outcome among the brushed points.
bars = alt.Chart(HSC).mark_bar()
bars = bars.encode(
    x='count()',
    y=alt.Y('recovered_dead:N', title='Outcome'),
    color=alt.condition(click, color, alt.value('lightgrey')),
)
bars = bars.add_selection(click)
bars = bars.transform_filter(brush)
bars = bars.properties(width=800)

# Stack the scatter plot on top of the bar chart.
alt.vconcat(
    points,
    bars,
    data=HSC,
    title="High sensitivity C-reactive protein",
)
# Fresh selections for this dashboard (same kinds as in the previous chart).
brush = alt.selection_interval()
click = alt.selection_multi(encodings=['color'])

# Scatter template with age on the y axis; each panel below supplies its own
# x encoding. `color` is the outcome colour encoding defined earlier.
base = alt.Chart(HSC).mark_point()
base = base.encode(
    y=alt.Y('age:Q', title='Age'),
    size=alt.Size('men_women:N', title='Gender'),
    color=alt.condition(brush, color, alt.value('lightgray')),
    tooltip=[
        'men_women:N',
        'recovered_dead:N',
        'age:N',
        '(%)lymphocyte:N',
        'Lactate dehydrogenase:N',
        'High sensitivity C-reactive protein:N',
    ],
)
base = base.add_selection(brush)
base = base.properties(width=400, height=400)
base = base.transform_filter(click)

# Outcome counts among the brushed points; clicking a bar filters the scatters.
bars = alt.Chart(HSC).mark_bar()
bars = bars.encode(
    x='count():Q',
    y=alt.Y('recovered_dead:N', title='Outcome'),
    color=alt.condition(click, color, alt.value('lightgrey')),
)
bars = bars.add_selection(click)
bars = bars.transform_filter(brush)
bars = bars.properties(width=400)

# One column per lab value (scatter above, bar chart below), laid out side by
# side. `&` binds tighter than `|`, so this matches the one-line expression.
ldh_panel = base.encode(x='Lactate dehydrogenase') & bars
crp_panel = base.encode(x='High sensitivity C-reactive protein') & bars
lymphocyte_panel = base.encode(x='(%)lymphocyte') & bars
ldh_panel | crp_panel | lymphocyte_panel
# One row per patient with every numeric measurement averaged.
# `numeric_only=True` states explicitly that the timestamp columns are left
# out of the mean.
data_1 = data.groupby("PATIENT_ID").mean(numeric_only=True)

# Turn the numeric codes into readable labels. The result is assigned back
# instead of using `inplace=True` on a column selection, which is not
# guaranteed to modify the frame in recent pandas versions.
# Gender uses the same coding as the HSC['men_women'] mapping for the Altair
# charts (1 -> men, 2 -> women); previously this cell had the two swapped.
# NOTE(review): confirm the 1/2 coding against the dataset documentation.
data_1['gender'] = data_1['gender'].replace({1: 'Male', 2: 'Female'})
data_1['outcome'] = data_1['outcome'].replace({0: 'Recovered', 1: 'Dead'})

# Urea (log-scaled x axis) against calcium, coloured by outcome.
(ggplot(data_1, aes('Urea', 'calcium', color='outcome'))
 + geom_point()
 + scale_x_log10()
 + theme_bw()
 + theme(text=element_text(size=12)))
# 21 rows contain missing values and are dropped by plotnine (see the warning)
/home/bartek/.local/lib/python3.8/site-packages/plotnine/layer.py:401: PlotnineWarning: geom_point : Removed 21 rows containing missing values.
<ggplot: (8753215982318)>
#only to not mess with your data
def disc(x):
    """Return the age-group label for the age ``x``.

    Ages below 25 are 'Young', below 40 'Adult', below 60 'Middle Age';
    everything else is 'Senior'.
    """
    # (exclusive upper bound, label) pairs, checked in ascending order.
    age_groups = ((25, 'Young'), (40, 'Adult'), (60, 'Middle Age'))
    for upper_bound, label in age_groups:
        if x < upper_bound:
            return label
    return 'Senior'
# Work on an age-sorted copy of the per-patient table so that data_1 itself
# stays unchanged.
age_data = data_1.sort_values(by='age')

# Label every patient with an age group and store it as an ordered
# categorical so the groups keep their natural order.
age_groups = ['Young', 'Adult', 'Middle Age', 'Senior']
age_data['label_age'] = age_data['age'].apply(disc)
age_data['label_age'] = pd.Categorical(
    age_data['label_age'],
    categories=age_groups,
    ordered=True,
)
age_data['label_age']
PATIENT_ID
157.0 Young
213.0 Young
102.0 Young
200.0 Adult
195.0 Adult
...
247.0 Senior
313.0 Senior
309.0 Senior
290.0 Senior
212.0 Senior
Name: label_age, Length: 375, dtype: category
Categories (4, object): ['Young' < 'Adult' < 'Middle Age' < 'Senior']
from plotnine import theme_dark, facet_grid, theme_classic, element_rect, element_line
# Serum sodium against thrombocytocrit, coloured by outcome, with one panel
# per (age group, gender) combination on a dark theme with gray grid lines.
sodium_vs_thrombocytocrit = aes(
    x='serum sodium',
    y='thrombocytocrit',
    color='outcome',
)
gray_grid = theme(
    panel_grid_minor=element_line(colour="gray"),
    panel_grid_major=element_line(colour="gray"),
)
(
    ggplot(age_data, sodium_vs_thrombocytocrit)
    + geom_point()
    + facet_grid(facets='label_age ~ gender')
    + theme_dark()
    + gray_grid
)
/home/bartek/.local/lib/python3.8/site-packages/plotnine/layer.py:401: PlotnineWarning: geom_point : Removed 32 rows containing missing values.
<ggplot: (8753210058512)>
# Consecutive five-year age bins as [lower, upper] pairs: [15, 20] ... [90, 95].
bins = [[lower, lower + 5] for lower in range(15, 95, 5)]


def disc(x):
    """Return the lower edge of the five-year bin that contains the age ``x``.

    A bin ``[lower, upper]`` matches when ``lower < x <= upper``, so an age of
    20 falls into the bin starting at 15. Ages outside every bin (``x <= 15``
    or ``x > 95``) yield ``None``.
    """
    # Iterate over the bins themselves rather than a hard-coded range(16),
    # so the lookup stays correct if the bin list is ever changed.
    for lower, upper in bins:
        if lower < x <= upper:
            return lower
    return None
# Five-year age bin of every patient (used as the animation frame below).
age_data['disc_age'] = age_data['age'].apply(disc)
names = age_data.columns

# Replace missing lab values with the column mean. The measurement columns
# are selected by name: the previous positional slice `names[3:-3]` also
# skipped the last lab column (creatinine) once the two helper columns had
# been appended, leaving its NaNs in place.
non_lab_columns = {'age', 'gender', 'outcome', 'label_age', 'disc_age'}
lab_columns = [name for name in names if name not in non_lab_columns]
# Assign the filled block back; calling fillna(inplace=True) on a column
# selection is not guaranteed to modify the frame in recent pandas versions.
age_data[lab_columns] = age_data[lab_columns].fillna(age_data[lab_columns].mean())
age_data
| age | gender | outcome | Hypersensitive cardiac troponinI | hemoglobin | Serum chloride | Prothrombin time | procalcitonin | eosinophils(%) | Interleukin 2 receptor | ... | High sensitivity C-reactive protein | HIV antibody quantification | serum sodium | thrombocytocrit | ESR | glutamic-pyruvic transaminase | eGFR | creatinine | label_age | disc_age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| PATIENT_ID | |||||||||||||||||||||
| 157.0 | 18 | Male | Recovered | 1.900000 | 127.000000 | 103.850000 | 14.600000 | 0.020000 | 0.566667 | 934.595293 | ... | 0.650000 | 0.090000 | 143.500000 | 0.225000 | 4.500000 | 41.000000 | 215.450000 | 12.500000 | Young | 15 |
| 213.0 | 19 | Female | Dead | 12.800000 | 108.000000 | 97.500000 | 16.900000 | 0.130000 | 0.600000 | 934.595293 | ... | 51.900000 | 0.070000 | 134.500000 | 0.214387 | 8.000000 | 11.000000 | 130.800000 | 69.000000 | Young | 15 |
| 102.0 | 22 | Male | Recovered | 1.900000 | 138.000000 | 100.800000 | 15.000000 | 0.030000 | 0.700000 | 582.000000 | ... | 22.600000 | 0.099745 | 140.600000 | 0.200000 | 16.000000 | 19.000000 | 127.900000 | 55.500000 | Young | 20 |
| 200.0 | 25 | Female | Recovered | 765.964278 | 125.219553 | 102.412216 | 15.607362 | 0.880558 | 0.680637 | 934.595293 | ... | 70.413724 | 0.099745 | 140.737974 | 0.214387 | 33.867593 | 38.709738 | 84.037712 | NaN | Adult | 20 |
| 195.0 | 26 | Male | Recovered | 1.900000 | 136.000000 | 98.200000 | 13.900000 | 0.020000 | 0.500000 | 447.000000 | ... | 1.100000 | 0.150000 | 138.200000 | 0.300000 | 4.000000 | 16.000000 | 130.400000 | 48.000000 | Adult | 25 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 247.0 | 90 | Female | Dead | 1382.400000 | 110.250000 | 105.800000 | 20.300000 | 0.183333 | 0.100000 | 934.595293 | ... | 79.150000 | 0.070000 | 138.850000 | 0.072500 | 47.000000 | 12.333333 | 76.666667 | 72.666667 | Senior | 85 |
| 313.0 | 91 | Female | Dead | 15.900000 | 104.500000 | 105.566667 | 15.150000 | 0.115000 | 0.000000 | 1190.000000 | ... | 140.700000 | 0.060000 | 144.566667 | 0.160000 | 60.000000 | 20.000000 | 87.100000 | 54.333333 | Senior | 90 |
| 309.0 | 92 | Female | Dead | 141.600000 | 119.750000 | 116.533333 | 22.400000 | 1.010000 | 0.000000 | 513.000000 | ... | 154.633333 | 0.090000 | 151.533333 | 0.130000 | 39.000000 | 42.666667 | 43.675000 | 132.750000 | Senior | 90 |
| 290.0 | 94 | Male | Dead | 9.900000 | 121.500000 | 97.800000 | 15.400000 | 0.485000 | 0.000000 | 934.595293 | ... | 83.400000 | 0.099745 | 137.900000 | 0.155000 | 47.000000 | 12.000000 | 66.200000 | 69.000000 | Senior | 90 |
| 212.0 | 95 | Female | Dead | 280.700000 | 108.000000 | 109.250000 | 17.800000 | 0.600000 | 1.600000 | 2161.000000 | ... | 78.000000 | 0.070000 | 142.100000 | 0.190000 | 80.000000 | 18.000000 | 26.300000 | 184.000000 | Senior | 90 |
375 rows × 79 columns
# Animated scatter of hemoglobin against eosinophils(%), one frame per
# five-year age bin. Marker size encodes the Interleukin 2 receptor value and
# marker symbol encodes the outcome.
# The symbol_map keys must equal the values in the 'outcome' column
# ('Dead' / 'Recovered'); the previous keys 'dead' / 'alive' never matched,
# so the mapping was ignored.
# NOTE(review): title='title' looks like a placeholder; kept unchanged.
fig = px.scatter(
    age_data,
    x="hemoglobin",
    y="eosinophils(%)",
    animation_frame="disc_age",  # animation_group="gender",
    size="Interleukin 2 receptor",
    hover_data=["gender"],
    symbol="outcome",
    symbol_map={'Dead': 'x', 'Recovered': 'circle'},
    range_x=[60, 180],
    range_y=[-1, 6],
    size_max=40,
    title='title',
)
# Light background and a black frame around the plotting area.
fig.update_layout({'plot_bgcolor': 'rgba(224, 236, 244, 0.3)'})
fig.update_xaxes(showline=True, linewidth=2, linecolor='black', mirror=True)
fig.update_yaxes(showline=True, linewidth=2, linecolor='black', mirror=True)
#fig["layout"].pop("updatemenus") # optional, drop animation buttons
fig.update_layout(legend_title_text=' Outcome:')
fig.show()